////////////////////////////////////////////////////////////////////////////////////////////////////
// MNRT License
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 Mathias Neumann, www.maneumann.com.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are 
// permitted provided that the following conditions are met:
// 
// 1. Redistributions of source code must retain the above copyright notice, 
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, 
//    this list of conditions and the following disclaimer in the documentation and/or 
//    other materials provided with the distribution.
//
// 3. Neither the name Mathias Neumann, nor the names of contributors may be 
//    used to endorse or promote products derived from this software without 
//    specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
// POSSIBILITY OF SUCH DAMAGE.
////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////
/// \file	GPU\kdtree_chunklist.cu
///
/// \brief	Kernels for kd-tree construction, specifically chunk list construction.
/// \author	Mathias Neumann
/// \date	01.04.2010
/// \ingroup	kdtreeCon
////////////////////////////////////////////////////////////////////////////////////////////////////

#include "MNUtilities.h"
#include "kd-tree/KDKernelDefs.h"

#include "mncudautil_dev.h"

// Defined in kdtree.cu
extern cudaDeviceProp f_DevProps;


////////////////////////////////////////////////////////////////////////////////////////////////////
/// \KERNELS
////////////////////////////////////////////////////////////////////////////////////////////////////
//@{

////////////////////////////////////////////////////////////////////////////////////////////////////
/// \fn	__global__ void kernel_GetChunkCounts(uint* d_numElemsNode, uint numNodes,
/// 	uint* d_outChunkCounts)
///
/// \brief	Computes chunk count for each node.
///
///			The chunk count is defined as the number of chunks required for a given node. It is
///			computed by taking into account that a chunk may only contain up to ::KD_CHUNKSIZE
///			elements.
///
/// \author	Mathias Neumann
/// \date	16.02.2010
///
/// \param [in]		d_numElemsNode		Number of elements in each kd-tree node. 
/// \param	numNodes					Number of nodes. 
/// \param [out]	d_outChunkCounts	Chunk count for each kd-tree node.
////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void kernel_GetChunkCounts(uint* d_numElemsNode, uint numNodes, uint* d_outChunkCounts)
{
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;

	if(idx < numNodes)
	{
		uint numElems = d_numElemsNode[idx];
		// NOTE: These division and modulo operation are optimized by the compiler since
		//		 KD_CHUNKSIZE is power of 2.
		d_outChunkCounts[idx] = numElems / KD_CHUNKSIZE + ((numElems % KD_CHUNKSIZE) ? 1 : 0);
	}
}


////////////////////////////////////////////////////////////////////////////////////////////////////
/// \fn	__global__ void kernel_GenerateChunks(uint* d_numElemsNode, uint* d_idxFirstElemNode,
/// 	uint numNodes, uint* d_offsets, KDChunkList lstChunks)
///
/// \brief	Generates chunks for kd-tree nodes. 
///
///			Chunks are generated by iterative creation within node borders. All nodes are handled
///			in parallel. Provided offsets permit this approach.
///
/// \warning Heavy uncoalesced memory access. 
/// \note	Remember to set chunk count. This cannot be done within this kernel.
///
/// \author	Mathias
/// \date	16.02.2010
///
/// \param [in]		d_numElemsNode		Number of elements in each kd-tree node.
/// \param [in]		d_idxFirstElemNode	Index of first node element in element node association list
///										(for each node). 
/// \param	numNodes					Number of nodes. 
/// \param [in]		d_offsets			Offsets that define where to place the generated chunks
/// 									for each node.
/// \param	lstChunks					The generated chunk list.
////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void kernel_GenerateChunks(uint* d_numElemsNode, uint* d_idxFirstElemNode, uint numNodes, 
									  uint* d_offsets, KDChunkList lstChunks)
{
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;

	if(idx < numNodes)
	{
		uint idxStart = d_offsets[idx];
		uint firstElem = d_idxFirstElemNode[idx];
		uint numElems = d_numElemsNode[idx];

		uint curTri = 0, i = 0;
		while(curTri < numElems)
		{
			// Generate chunk.
			lstChunks.d_idxNode[idxStart + i] = idx;
			lstChunks.d_idxFirstElem[idxStart + i] = firstElem + curTri;

			uint tris = min(KD_CHUNKSIZE, numElems-curTri);
			lstChunks.d_numElems[idxStart + i] = tris;
			curTri += tris;
			i++;
		}
	}
}

////////////////////////////////////////////////////////////////////////////////////////////////////
/// \fn	__global__ void kernel_CountElemsPerChunk(KDChunkList lstChunks, uint* d_validFlags,
/// 	uint* d_outCountPerChunk)
///
/// \brief	Counts valid elements per chunk.
/// 		
/// 		Exploits chunk size to perform reduction for each chunk's range of valid flags. 
///
/// \note	Required shared memory per thread block of size N: 4 * N + 8 bytes.
///
/// \author	Mathias Neumann
/// \date	18.02.2010 
///	\see	::dev_ReduceFast()
///
/// \param	lstChunks					The chunks list. 
/// \param [in]		d_validFlags		The valid flags binary 0/1 array. Contains an entry for each
///										element, i.e. is of equal format as the ENA of the
///										corresponding node list. Only elements with flag = 1 are
///										valid.
/// \param [out]	d_outCountPerChunk	Computed count of valid elements for each chunk. 
////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void kernel_CountElemsPerChunk(KDChunkList lstChunks, uint* d_validFlags, uint* d_outCountPerChunk)
{
	uint chk = MNCUDA_GRID2DINDEX;

	__shared__ uint s_numElemsChunk;
	__shared__ uint s_idxFirstElem;
	if(threadIdx.x == 0)
	{
		s_numElemsChunk = lstChunks.d_numElems[chk];
		s_idxFirstElem = lstChunks.d_idxFirstElem[chk];
	}
	__syncthreads();

	// Copy chunks's flags into shared memory. Preload and -add two values directly.
	__shared__ uint smem[KD_CHUNKSIZE];
	uint v1 = 0, v2 = 0;
	if(threadIdx.x < s_numElemsChunk)
		v1 = d_validFlags[s_idxFirstElem + threadIdx.x];
	if(threadIdx.x+blockDim.x < s_numElemsChunk)
		v2 = d_validFlags[s_idxFirstElem + threadIdx.x+blockDim.x];
	smem[threadIdx.x] = v1 + v2;
	__syncthreads();

	// Now perform reduction on chunks's flags.
	uint res = dev_ReduceFast<uint, KD_CHUNKSIZE/2, ReduceOperatorTraits<uint, MNCuda_ADD>>(smem);
	if(threadIdx.x == 0)
		d_outCountPerChunk[chk] = res;
}

//@}
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////
/// \WRAPPERS
////////////////////////////////////////////////////////////////////////////////////////////////////
//@{

/// Wraps kernel_GetChunkCounts() kernel call.
extern "C"
void KernelKDGetChunkCounts(uint* d_numElemsNode, uint numNodes, uint* d_outChunkCounts)
{
	dim3 blockSize = dim3(256, 1, 1);
	dim3 gridSize = dim3(MNCUDA_DIVUP(numNodes, blockSize.x), 1, 1);
	kernel_GetChunkCounts<<<gridSize, blockSize>>>(d_numElemsNode, numNodes, d_outChunkCounts);
	MNCUDA_CHECKERROR;
}

/// Wraps kernel_GenerateChunks() kernel call.
extern "C"
void KernelKDGenerateChunks(uint* d_numElemsNode, uint* d_idxFirstElemNode, uint numNodes, 
						uint* d_offsets, KDChunkList& lstChunks)
{
	dim3 blockSize = dim3(256, 1, 1);
	dim3 gridSize = dim3(MNCUDA_DIVUP(numNodes, blockSize.x), 1, 1);
	kernel_GenerateChunks<<<gridSize, blockSize>>>(d_numElemsNode, d_idxFirstElemNode, numNodes,
		d_offsets, lstChunks);
	MNCUDA_CHECKERROR;
}

/// Wraps kernel_CountElemsPerChunk() kernel call.
extern "C"
void KernelKDCountElemsPerChunk(const KDChunkList& lstChunks, uint* d_validFlags, uint* d_outCountPerChunk)
{
	// Note that we use half the chunk size here. This is a reduction optimization.
	dim3 blockSize = dim3(KD_CHUNKSIZE/2, 1, 1);
	dim3 gridSize = MNCUDA_MAKEGRID2D(lstChunks.numChunks, f_DevProps.maxGridSize[0]);
	kernel_CountElemsPerChunk<<<gridSize, blockSize>>>(lstChunks, d_validFlags, d_outCountPerChunk);
	MNCUDA_CHECKERROR;
}

//@}
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////